# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# KNN – K-nearest neighbor classification
# Data – miles per gallon and other variables from the Auto data set.
> library(ISLR)                  # This library contains datasets from our textbook (ISLR = name of our text)
> attach(Auto)
> names(Auto) # List of variables in this dataset
[1] "mpg"
"cylinders"
"displacement" "horsepower" "weight" "acceleration" "year" "origin" "name"
> summary(mpg)                   # Economy rating will be defined based on miles per gallon
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   9.00   17.00   22.75   23.45   29.00   46.60
# Initiate a fuel consumption rating variable that will be treated as categorical
> Economy = rep("Gas consumption", length(mpg))
> Economy[mpg <= 17] = "Heavy"
> Economy[mpg > 17 & mpg <= 22.75] = "OK"
> Economy[mpg > 22.75 & mpg <= 29] = "Eco"
> Economy[mpg > 29] = "Excellent"
> table(Economy)                 # We used sample quartiles of variable mpg to define these ratings,
Economy                          # that’s why we got four approximately equal groups.
      Eco Excellent     Heavy        OK
      101        95        99        97
# Now, we’ll derive a classification rule, using other car characteristics.
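# Side note: the same four ratings could be built in one step with cut() and the
# sample quartiles. A sketch (Economy2 is a hypothetical name; the counts should
# match table(Economy) above):
br = quantile(mpg, c(0, 0.25, 0.5, 0.75, 1))        # 9.00 17.00 22.75 29.00 46.60
Economy2 = cut(mpg, breaks = br,
               labels = c("Heavy", "OK", "Eco", "Excellent"),
               include.lowest = TRUE)               # [9,17], (17,22.75], (22.75,29], (29,46.6]
table(Economy2)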
# Prepare training and testing data, predictors (X) and responses (Y)
> n = length(mpg)
> Z = sample(n, n/2) # We’ll split data at random
> Auto.training = Auto[Z, ]      # Rows of Auto with indices in the random subsample Z
> Auto.testing = Auto[-Z, ]      # Notice the “minus” sign to denote all indices except those in Z
> dim(Auto)
[1] 392   9
> dim(Auto.training)
[1] 196   9
> dim(Auto.testing)
[1] 196   9
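# Note: sample() is random, so every run of this script produces a different split
# (and slightly different error rates below). For a reproducible split, one could
# fix a seed before sampling, e.g.:
set.seed(1)                      # any fixed seed, set before the split above
Z = sample(n, n/2)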
> names(Auto)
[1] "mpg" "cylinders"
"displacement" "horsepower" "weight"
"acceleration" "year" "origin"
"name"
# KNN in R requires 4 inputs: training X, testing X, training Y, and K.
> X.training = Auto.training[ , 2:7 ]   # Take columns (variables) 2-7. That’s from cylinders to year.
> X.testing = Auto.testing[ , 2:7 ]
> Y.training = Economy[ Z ]
> Y.testing = Economy[ -Z ]
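# KNN classifies by Euclidean distance, so predictors on large scales (e.g., weight,
# in pounds) dominate predictors on small scales (e.g., cylinders). A common remedy,
# not applied in this session, is to standardize the predictors first. A sketch:
X.scaled = scale(Auto[ , 2:7 ])  # center and scale all six predictors together
X.training = X.scaled[ Z, ]
X.testing  = X.scaled[ -Z, ]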
# The KNN tool is in the package “class”.
> library(class)
> knn.result = knn( X.training, X.testing, Y.training, 3 )
> table( Y.testing, knn.result )
           knn.result
Y.testing   Eco Excellent Heavy OK
  Eco        19        17     1 11
  Excellent   9        35     0  2
  Heavy       0         0    38  8
  OK          5         3     5 32
> mean( Y.testing == knn.result )
[1] 0.6702703                    # 67% correct classification rate with K=3. Is there a better K?
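# The overall rate hides differences between classes. A quick sketch of per-class
# correct classification rates, computed from the confusion matrix above:
conf = table( Y.testing, knn.result )
diag(conf) / rowSums(conf)       # fraction of each true class predicted correctly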
# We’ll check all K from 1 to 20.
> class.rate = rep(0,20)         # Create a vector of length 20 and fill it with
                                 # classification rates, computed in a for loop
> for (K in 1:20) {
+ knn.result = knn( X.training, X.testing, Y.training, K )
+ class.rate[K]=mean( Y.testing == knn.result )
+ }
> class.rate                     # Apparently, K=6 and K=8 provide a slightly better
                                 # prediction, although still not as good as LDA.
 [1] 0.6378378 0.6378378 0.6702703 0.6810811 0.6810811 0.6918919 0.6702703
 [8] 0.6918919 0.6648649 0.6648649 0.6594595 0.6594595 0.6756757 0.6702703
[15] 0.6702703 0.6864865 0.6702703 0.6702703 0.6702703 0.6702703
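# To see the pattern, one can plot the rates against K and locate the maximizer:
plot(1:20, class.rate, type = "b", xlab = "K", ylab = "Correct classification rate")
which.max(class.rate)            # returns the first K attaining the highest rate
# For the LDA comparison mentioned above, a sketch using the MASS package,
# fit on the same training data and evaluated on the same testing data:
library(MASS)
lda.fit = lda( x = X.training, grouping = Y.training )
mean( Y.testing == predict( lda.fit, X.testing )$class )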